# This script prepares the different oversampled and undersampled training datasets to develop the PRS-only models. 
# These PRS-only training datasets prepared in this script will have had the following optimisation techniques applied: ADASYN oversampling and/or random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "PRSonly_model_641ID_standardised_training_dataset.csv" is found in IOWBC_PRS_data.xlsx, sheet: "PRS standardised training set"
# The data in files named "PRS_only_standardised_oversampled_training_dataset_XXX.csv" were developed using the script "Data_preparation_PRS_oversampling.txt" (data can be found in IOWBC_PRS_data.xlsx).
# Python version 3.6.8 was used 

# Imports
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

# Set working directory
# NOTE(review): "/../../" simply resolves to the filesystem root "/" — this
# looks like a redacted/placeholder path; point it at the actual data
# directory before running.
os.chdir("/../../")

#######################
### Import datasets ###
#######################
# Construct both oversampled and undersampled datasets in the following way:
# data_0 = complete case		data_0_U = complete case, undersampled
# data_25_O = 25% oversampled cases		data_25_OU = 25% oversampled cases, undersampled controls to 1:1 class ratio

# Complete-case (0% oversampled) training data: 538 controls, 103 cases.
data_0 = pd.read_csv("PRSonly_model_641ID_standardised_training_dataset.csv", index_col=False)
del data_0['Unnamed: 0']  # drop the CSV's exported row-index column
data_0.rename(columns={'IID':'Study_ID'}, inplace=True)
print('Original dataset shape %s' % Counter(data_0.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 103})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]  # cases
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (103)
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0); with ignore_index=True the result is identical.
data_0_U = pd.concat([s1, s0], ignore_index=True)
data_0_U = shuffle(data_0_U, random_state=123)
print('Original dataset shape %s' % Counter(data_0_U.Asthma_10YR))
# Original dataset shape Counter({0: 103, 1: 103})


# 25% ADASYN-oversampled cases: first 667 rows = 538 controls + 129 cases.
data_25_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_25%.csv", index_col=False)
data_25_O = data_25_O.iloc[0:667, :]
print('Original dataset shape %s' % Counter(data_25_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 129})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_25_O.loc[data_25_O['Asthma_10YR'] == 1]  # cases
s0 = data_25_O.loc[data_25_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (129)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_25_OU = pd.concat([s1, s0], ignore_index=True)
data_25_OU = shuffle(data_25_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_25_OU.Asthma_10YR))
# Original dataset shape Counter({0: 129, 1: 129})


# 50% ADASYN-oversampled cases: first 693 rows = 538 controls + 155 cases.
data_50_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_50%.csv", index_col=False)
data_50_O = data_50_O.iloc[0:693, :]
print('Original dataset shape %s' % Counter(data_50_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 155})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_50_O.loc[data_50_O['Asthma_10YR'] == 1]  # cases
s0 = data_50_O.loc[data_50_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (155)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_50_OU = pd.concat([s1, s0], ignore_index=True)
data_50_OU = shuffle(data_50_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_50_OU.Asthma_10YR))
# Original dataset shape Counter({0: 155, 1: 155})


# 100% ADASYN-oversampled cases: first 744 rows = 538 controls + 206 cases.
data_100_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_100%.csv", index_col=False)
data_100_O = data_100_O.iloc[0:744, :]
print('Original dataset shape %s' % Counter(data_100_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 206})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_100_O.loc[data_100_O['Asthma_10YR'] == 1]  # cases
s0 = data_100_O.loc[data_100_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (206)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_100_OU = pd.concat([s1, s0], ignore_index=True)
data_100_OU = shuffle(data_100_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_100_OU.Asthma_10YR))
# Original dataset shape Counter({0: 206, 1: 206})


# 150% ADASYN-oversampled cases: first 796 rows = 538 controls + 258 cases.
data_150_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_150%.csv", index_col=False)
data_150_O = data_150_O.iloc[0:796, :]
print('Original dataset shape %s' % Counter(data_150_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 258})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_150_O.loc[data_150_O['Asthma_10YR'] == 1]  # cases
s0 = data_150_O.loc[data_150_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (258)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_150_OU = pd.concat([s1, s0], ignore_index=True)
data_150_OU = shuffle(data_150_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_150_OU.Asthma_10YR))
# Original dataset shape Counter({0: 258, 1: 258})


# 200% ADASYN-oversampled cases: first 847 rows = 538 controls + 309 cases.
data_200_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_200%.csv", index_col=False)
data_200_O = data_200_O.iloc[0:847, :]
print('Original dataset shape %s' % Counter(data_200_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 309})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_200_O.loc[data_200_O['Asthma_10YR'] == 1]  # cases
s0 = data_200_O.loc[data_200_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (309)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_200_OU = pd.concat([s1, s0], ignore_index=True)
data_200_OU = shuffle(data_200_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_200_OU.Asthma_10YR))
# Original dataset shape Counter({0: 309, 1: 309})


# 250% ADASYN-oversampled cases: first 899 rows = 538 controls + 361 cases.
data_250_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_250%.csv", index_col=False)
data_250_O = data_250_O.iloc[0:899, :]
print('Original dataset shape %s' % Counter(data_250_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 361})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_250_O.loc[data_250_O['Asthma_10YR'] == 1]  # cases
s0 = data_250_O.loc[data_250_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (361)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_250_OU = pd.concat([s1, s0], ignore_index=True)
data_250_OU = shuffle(data_250_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_250_OU.Asthma_10YR))
# Original dataset shape Counter({0: 361, 1: 361})


# 300% ADASYN-oversampled cases: first 950 rows = 538 controls + 412 cases.
data_300_O = pd.read_csv("PRS_only_standardised_oversampled_training_dataset_300%.csv", index_col=False)
data_300_O = data_300_O.iloc[0:950, :]
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 538, 1: 412})

# Undersample the controls to a 1:1 class ratio with the cases
s1 = data_300_O.loc[data_300_O['Asthma_10YR'] == 1]  # cases
s0 = data_300_O.loc[data_300_O['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:len(s1), ]  # keep as many controls as there are cases (412)
# pd.concat replaces the deprecated/removed DataFrame.append; identical result.
data_300_OU = pd.concat([s1, s0], ignore_index=True)
data_300_OU = shuffle(data_300_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Original dataset shape Counter({0: 412, 1: 412})


# Assign all training datasets to be considered for model development into
# the data object: the 8 oversampled variants first, then their undersampled
# counterparts, in matching order.
data = [
    data_0, data_25_O, data_50_O, data_100_O,
    data_150_O, data_200_O, data_250_O, data_300_O,
    data_0_U, data_25_OU, data_50_OU, data_100_OU,
    data_150_OU, data_200_OU, data_250_OU, data_300_OU,
]

# Indices of the datasets in `data`; used during model development to loop
# through each training dataset. Derived from len(data) so it stays in sync
# if datasets are added/removed.
# NOTE(review): the name 'set' shadows the builtin set type; it is kept
# unchanged because the follow-up Model_development_* scripts reference it.
set = list(range(len(data)))

# Import standardised PRS test data - data found in IOWBC_PRS_data.xlsx, sheet: "PRS standardised test set"
test = pd.read_csv("PRSonly_model_267ID_standardised_test_dataset.csv", index_col=False)
# Drop the CSV's exported row-index column
del test['Unnamed: 0']
# Split test data into features (everything except the ID and outcome
# columns) and the outcome itself
X_test = test.drop(columns=['IID', 'Asthma_10YR'])
y_test = test['Asthma_10YR']


